Rows: 17379 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Date
dbl (13): Season, Hour, Holiday, Day of the Week, Working Day, Weather Type,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
plotts.sample.wge(bike_data$`Total Users`)$xbar
[1] 189.4631
# make the bike data daily instead of hourly daily_bike_data <- bike_data %>%group_by(Date =as.Date(Date, format ="%m/%d/%Y")) %>%summarise(Season =mean(Season),Hour =mean(Hour),Holiday =mean(Holiday),Day_of_the_Week =mean(`Day of the Week`),Working_Day =mean(`Working Day`),Weather_Type =mean(`Weather Type`),Temperature =mean(`Temperature F`),Temperature_Feels =mean(`Temperature Feels F`),Humidity =mean(Humidity),Wind_Speed =mean(`Wind Speed`),Casual_Users =sum(`Casual Users`),Registered_Users =sum(`Registered Users`),Total_Users =sum(`Total Users`), )plotts.sample.wge(daily_bike_data$Total_Users)$xbar
[1] 4504.349
Models
Univariate: At least 2 candidate ARMA / ARIMA models
Seasonal Model
# create model plotts.sample.wge(daily_bike_data$Total_Users)$xbar # looking at the ACF's there seems to be a seasonality of 2
[1] 4504.349
factor.wge(phi =c(0,1))
Coefficients of AR polynomial:
0.0000 1.0000
AR Factor Table
Factor Roots Abs Recip System Freq
1+1.0000B -1.0000 1.0000 0.5000
1-1.0000B 1.0000 1.0000 0.0000
est.arma.wge(daily_bike_data$Total_Users, p =15) # looking at the overfit tables there's evidence of seasonality of 2
aic5.wge(diff,type ='bic') # bic picked p = 2 and q = 2
---------WORKING... PLEASE WAIT...
Five Smallest Values of bic
p q bic
4 2 13.73395
5 2 13.75966
5 1 13.86287
4 0 13.86295
5 0 13.86810
seasonal_est =est.arma.wge(diff, p =2, q =2)
Coefficients of AR polynomial:
0.3610 -0.0389
AR Factor Table
Factor Roots Abs Recip System Freq
1-0.3610B+0.0389B^2 4.6361+-2.0477i 0.1973 0.0662
Coefficients of MA polynomial:
-0.1184 0.8816
MA FACTOR TABLE
Factor Roots Abs Recip System Freq
1+1.0000B -1.0000 1.0000 0.5000
1-0.8816B 1.1343 0.8816 0.0000
# Checking the residuals to see if the model is appropriate plotts.wge(seasonal_est$res) # looks randomacf(seasonal_est$res) # all the acfs are within the bounds ljung.wge(seasonal_est$res) # greater than 0.05
seasonal_gen1 =gen.arima.wge(200, phi = seasonal_est$phi, theta = seasonal_est$theta, s =2)seasonal_gen2 =gen.arima.wge(200, phi = seasonal_est$phi, theta = seasonal_est$theta, s =2)
seasonal_gen3 =gen.arima.wge(200, phi = seasonal_est$phi, theta = seasonal_est$theta, s =2)seasonal_gen4 =gen.arima.wge(200, phi = seasonal_est$phi, theta = seasonal_est$theta, s =2)plotts.sample.wge(seasonal_gen1)$xbar
[1] -2.034474
plotts.sample.wge(seasonal_gen2)$xbar
[1] -12.59861
plotts.sample.wge(seasonal_gen3)$xbar
[1] -10.57343
plotts.sample.wge(seasonal_gen4)$xbar # all these 4 random realizations have the same behavior as the original data so the model seems to be appropriate
[1] 1.590323
# Compare Spectral Densities: sims =20SpecDen =parzen.wge(daily_bike_data$Total_Users, plot ="FALSE")plot(SpecDen$freq,SpecDen$pzgram, type ="l", lwd =6)for( i in1: sims){ SpecDen2 =parzen.wge(gen.aruma.wge(length(daily_bike_data$Total_Users), s =2, phi = seasonal_est$phi, theta = seasonal_est$theta, plot ="FALSE"), plot ="FALSE")lines(SpecDen2$freq,SpecDen2$pzgram, lwd =2, col ="red")}
# Compare ACFs:sims =20ACF =acf(daily_bike_data$Total_Users, plot ="FALSE")plot(ACF$lag ,ACF$acf , type ="l", lwd =6)for( i in1: sims){ ACF2 =acf(gen.aruma.wge(length(daily_bike_data$Total_Users), s =2, phi = seasonal_est$phi, theta = seasonal_est$theta, plot ="FALSE"), plot ="FALSE")lines(ACF2$lag ,ACF2$acf, lwd =2, col ="red")}
# plot of Last N forecasts for short and long term horizon seasonal_stfore1 =fore.arima.wge(daily_bike_data$Total_Users, phi = seasonal_est$phi, theta = seasonal_est$theta, s =2, n.ahead =7, lastn =TRUE)
# ASE seasonal_stASE =mean((daily_bike_data$Total_Users[725:731]-seasonal_stfore1$f)^2)seasonal_ltASE =mean((daily_bike_data$Total_Users[672:731]-seasonal_ltfore1$f)^2)seasonal_stASE
[1] 4368663
seasonal_ltASE
[1] 4042180
# Rolling Window RMSE# RW-RMSE commented out due to obscene amount of unsupressable output# seasonal_strwRMSE = roll.win.rmse.wge(daily_bike_data$Total_Users, horizon = 7, phi = seasonal_est$phi, theta = seasonal_est$theta, s = 2)$rwRMSE# seasonal_ltrwRMSE = roll.win.rmse.wge(daily_bike_data$Total_Users, horizon = 60, phi = seasonal_est$phi, theta = seasonal_est$theta, s = 2)$rwRMSEseasonal_strwRMSE =1253.353seasonal_ltrwRMSE =1504.218seasonal_stfore2 =fore.arima.wge(daily_bike_data$Total_Users, phi = seasonal_est$phi, theta = seasonal_est$theta, s =2, n.ahead =7)
# Plots of the short and Long Term Forecasts t =1:800plot(t[670:731],daily_bike_data$Total_Users[670:731], type ='l', main ="Short Term Forecast", xlim =c(670,795), xlab ="Time", ylab ="Total Users")points(t[732:738],seasonal_stfore2$f, type ='l', col ='blue')points(t[732:738],seasonal_stfore2$ll, type ='l',lwd=2, lty =2, col ='red')points(t[732:738],seasonal_stfore2$ul, type ='l',lwd=2, lty =2, col ='red')
plot(t[670:731],daily_bike_data$Total_Users[670:731], type ='l', main ="Long Term Forecast", xlim =c(670,795), xlab ="Time", ylab ="Total Users")points(t[732:791],seasonal_ltfore2$f, type ='l', col ='blue')points(t[732:791],seasonal_ltfore2$ll, type ='l',lwd=2, lty =2, col ='red')points(t[732:791],seasonal_ltfore2$ul, type ='l',lwd=2, lty =2, col ='red')
Non-Seasonal ARIMA Model
# Check if the data could be stationaryadf.test(daily_bike_data$Total_Users) # p-value 0.7, data not likely stationary
Augmented Dickey-Fuller Test
data: daily_bike_data$Total_Users
Dickey-Fuller = -1.6351, Lag order = 9, p-value = 0.7327
alternative hypothesis: stationary
# Difference of 1total_d1 =artrans.wge(daily_bike_data$Total_Users, phi.tr =c(1))
# Model the residualsaic5.wge(total_d1, type ='bic') # bic and aic pick p = 1 and q = 1
---------WORKING... PLEASE WAIT...
Five Smallest Values of bic
p q bic
1 1 13.68620
0 2 13.69304
2 1 13.69387
1 2 13.69455
3 1 13.69895
est =est.arma.wge(total_d1, p =1, q =1)
Coefficients of AR polynomial:
0.3591
AR Factor Table
Factor Roots Abs Recip System Freq
1-0.3591B 2.7848 0.3591 0.0000
Coefficients of MA polynomial:
0.8903
MA FACTOR TABLE
Factor Roots Abs Recip System Freq
1-0.8903B 1.1232 0.8903 0.0000
plotts.sample.wge(est$res, arlimits =TRUE)$xbar # appears to be white noise
# Generate realizations for comparisonarima_gen1 =gen.arima.wge(200, phi = est$phi, theta = est$theta, d =1)
arima_gen2 =gen.arima.wge(200, phi = est$phi, theta = est$theta, d =1)
arima_gen3 =gen.arima.wge(200, phi = est$phi, theta = est$theta, d =1)
arima_gen4 =gen.arima.wge(200, phi = est$phi, theta = est$theta, d =1)
plotts.sample.wge(arima_gen1)$xbar
[1] 11.9336
plotts.sample.wge(arima_gen2)$xbar
[1] -3.912301
plotts.sample.wge(arima_gen3)$xbar
[1] -11.69953
plotts.sample.wge(arima_gen4)$xbar # Similar behavior to original data
[1] -0.9305175
# Compare Spectral Densities: sims =20SpecDen =parzen.wge(daily_bike_data$Total_Users, plot ="FALSE")plot(SpecDen$freq,SpecDen$pzgram, type ="l", lwd =6)for( i in1: sims){ SpecDen2 =parzen.wge(gen.aruma.wge(length(daily_bike_data$Total_Users), d =1, phi = est$phi, theta = est$theta, plot ="FALSE"), plot ="FALSE")lines(SpecDen2$freq,SpecDen2$pzgram, lwd =2, col ="red")}
# Compare ACFs:sims =20ACF =acf(daily_bike_data$Total_Users, plot ="FALSE")plot(ACF$lag ,ACF$acf , type ="l", lwd =6)for( i in1: sims){ ACF2 =acf(gen.aruma.wge(length(daily_bike_data$Total_Users), d =1, phi = est$phi, theta = est$theta, plot ="FALSE"), plot ="FALSE")lines(ACF2$lag ,ACF2$acf, lwd =2, col ="red")}
# Short and Long Term Forecastsfore_arima_st =fore.arima.wge(daily_bike_data$Total_Users, phi = est$phi, theta = est$theta, d =1, n.ahead =7, lastn =TRUE)
# ASE arima_stASE =mean((daily_bike_data$Total_Users[725:731]-fore_arima_st$f)^2)arima_ltASE =mean((daily_bike_data$Total_Users[672:731]-fore_arima_lt$f)^2)arima_stASE
[1] 3230238
arima_ltASE
[1] 3930824
# Rolling Window RMSE# RW-RMSE commented out due to obscene amount of unsupressable output# arima_strwRMSE = roll.win.rmse.wge(daily_bike_data$Total_Users, horizon = 7, phi = est$phi, theta = est$theta, d = 1)$rwRMSE# arima_ltrwRMSE = roll.win.rmse.wge(daily_bike_data$Total_Users, horizon = 60, phi = est$phi, theta = est$theta, d = 1)$rwRMSEarima_strwRMSE =1237.393arima_ltrwRMSE =1503.197# Plots of the short and Long Term Forecasts fore_arima_st_2 =fore.arima.wge(daily_bike_data$Total_Users, phi = est$phi, theta = est$theta, d =1, n.ahead =7)
t =1:800plot(t[670:731],daily_bike_data$Total_Users[670:731], type ='l', main ="Short Term Forecast", xlim =c(670,795), xlab ="Time", ylab ="Total Users")points(t[732:738],fore_arima_st_2$f, type ='l', col ='blue')points(t[732:738],fore_arima_st_2$ll, type ='l',lwd=2, lty =2, col ='red')points(t[732:738],fore_arima_st_2$ul, type ='l',lwd=2, lty =2, col ='red')
plot(t[670:731],daily_bike_data$Total_Users[670:731], type ='l', main ="Long Term Forecast", xlim =c(670,795), xlab ="Time", ylab ="Total Users")points(t[732:791],fore_arima_lt_2$f, type ='l', col ='blue')points(t[732:791],fore_arima_lt_2$ll, type ='l',lwd=2, lty =2, col ='red')points(t[732:791],fore_arima_lt_2$ul, type ='l',lwd=2, lty =2, col ='red')
The models in factored form or at least separate the stationary and non- stationary factors with standard deviation or variance of the white noise.
AIC
ASE (short and long term forecasts)
Rolling Window RMSE (short and long term forecasts)
At least 10 superimposed spectral densities from 10 generated realizations like we did in Unit 11. Use these to help choose between the at least two candidate univariate models.
Visualization of Forecasts for both the short- and long-term Horizons.
Be sure and include confidence intervals when possible (I don’t have code for confidence intervals from MLP models at the moment… but that would be a good thing to work on! )
Multivariate: At least one multivariate model (VAR or MLR with Correlated Errors)
Include an ASE (rolling window is not yet available in multivariate models)
Short Horizon (you pick the length.. could be one step ahead)
Long Horizon (you pick … just must be longer than the short horizon.)
Describe the explanatory variable(s) used in the model and why you felt they were significant / important.
Visualization of Forecasts for both the short- and long-Horizons.
Be sure and include confidence intervals if using VAR …
ccf(daily_bike_data$Total_Users,daily_bike_data$Humidity) # lag of 5
daily_bike_data$Humidity_lag = dplyr::lag(daily_bike_data$Humidity,5)fit =lm(Total_Users~Humidity_lag + Temperature + Temperature_Feels + Day_of_the_Week + Wind_Speed, data = daily_bike_data)plotts.sample.wge(fit$residuals)$xbar
[1] 9.503264e-15
aic5.wge(fit$residuals, p =0:10, q =0:5, type ='bic')
---------WORKING... PLEASE WAIT...
Five Smallest Values of bic
p q bic
2 1 13.57138
1 3 13.57973
3 1 13.58033
2 2 13.58080
4 1 13.58595
mlr =arima(daily_bike_data$Total_Users, order =c(6,0,1),xreg =cbind(daily_bike_data$Humidity_lag, daily_bike_data$Temperature, daily_bike_data$Temperature_Feels, daily_bike_data$Day_of_the_Week, daily_bike_data$Wind_Speed))
Warning in log(s2): NaNs produced
Warning in arima(daily_bike_data$Total_Users, order = c(6, 0, 1), xreg =
cbind(daily_bike_data$Humidity_lag, : possible convergence problem: optim gave
code = 1
plotts.wge(mlr$residuals) # looks random
acf(mlr$residuals[-(1:5)]) # only 1/20 acfs out of bounds
Warning in summary.lm(x): essentially perfect fit: summary may be unreliable
t =1:800plot(seq(1,731,1),daily_bike_data$Total_Users,type ='l',xlim =c(670,738), main ="Short Term VAR Forecast")points(t[732:738], var_st_pred2$fcst$Total_Users[,1], type="l", lwd=2, lty =1, col ='blue')
Warning in summary.lm(x): essentially perfect fit: summary may be unreliable
t =1:800plot(seq(1,731,1),daily_bike_data$Total_Users,type ='l',xlim =c(670,791), main ="Long Term VAR Forecast")points(t[732:791], var_lt_pred2$fcst$Total_Users[,1], type="l", lwd=2, lty =1, col ='blue')
MLP Model
ASE (short and long term forecasts)
Rolling Window RMSE (short and long term forecasts (only if univariate)
Visualization of Forecasts for both the short- and long-term Horizons.
Confidence / Prediction intervals are not required (I don’t have code for confidence / prediction intervals (bootstrap intervals) for MLP models at the moment… but that would be a good thing to work on! )
# Convert tibble to a data frame with ts() objectsbike_DF <- daily_bike_data %>% dplyr::select(-c(Date, Hour, Holiday, Temperature_Feels, Casual_Users, Registered_Users)) %>%# Remove unnecessary columnsmutate(across(everything(), ~zoo(.x, order.by = daily_bike_data$Date))) %>%# Convert each column to a zoo objectmutate(across(everything(), ~as.ts(.x))) %>%# Convert zoo objects to ts objectsas.data.frame() # Convert the tibble to a data framebikeShortTrain = bike_DF[1:724,]bikeShortTest = bike_DF[725:731,]bikeLongTrain = bike_DF[1:671,]bikeLongTest = bike_DF[672:731,]seed =137
# Forecast Short-term predictor variables using MLPset.seed(seed)fit.mlp.short.Season =mlp(ts(bikeShortTrain[,"Season"]),reps =10, difforder =0, comb ="mean", det.type ="bin")fore.mlp.short.Season =forecast(fit.mlp.short.Season, h =7)plot(fore.mlp.short.Season)
fit.mlp.short.Day_of_the_Week =mlp(ts(bikeShortTrain[,"Day_of_the_Week"]),reps =10, difforder =0, comb ="mean", det.type ="bin")fore.mlp.short.Day_of_the_Week =forecast(fit.mlp.short.Day_of_the_Week, h =7)fore.mlp.short.Day_of_the_Week$mean =round(fore.mlp.short.Day_of_the_Week$mean) # Round to nearest whole numberplot(fore.mlp.short.Day_of_the_Week)
fit.mlp.short.Working_Day =mlp(ts(bikeShortTrain[,"Working_Day"]),reps =10, difforder =0, comb ="mean", det.type ="bin")fore.mlp.short.Working_Day =forecast(fit.mlp.short.Working_Day, h =7)fore.mlp.short.Working_Day$mean =round(fore.mlp.short.Working_Day$mean) # Round to nearest whole numberplot(fore.mlp.short.Working_Day)
# Pack into data framesBDF_fore_short =data.frame(Day =ts(seq(1,731,1)),Season =ts(c(bikeShortTrain$Season,fore.mlp.short.Season$mean)),Day_of_the_Week =ts(c(bikeShortTrain$Day_of_the_Week,fore.mlp.short.Day_of_the_Week$mean)),Working_Day =ts(c(bikeShortTrain$Working_Day,fore.mlp.short.Working_Day$mean)),# Weather_Type = ts(c(bikeShortTrain$Weather_Type,fore.mlp.short.Weather_Type$mean)),Temperature =ts(c(bikeShortTrain$Temperature,fore.mlp.short.Temperature$mean)),Humidity =ts(c(bikeShortTrain$Humidity,fore.mlp.short.Humidity$mean)),Wind_Speed =ts(c(bikeShortTrain$Wind_Speed,fore.mlp.short.Wind_Speed$mean)))BDF_fore_long =data.frame(Day =ts(seq(1,731,1)),Season =ts(c(bikeLongTrain$Season,fore.mlp.long.Season$mean)),Day_of_the_Week =ts(c(bikeLongTrain$Day_of_the_Week,fore.mlp.long.Day_of_the_Week$mean)),Working_Day =ts(c(bikeLongTrain$Working_Day,fore.mlp.long.Working_Day$mean)),# Weather_Type = ts(c(bikeLongTrain$Weather_Type,fore.mlp.long.Weather_Type$mean)),Temperature =ts(c(bikeLongTrain$Temperature,fore.mlp.long.Temperature$mean)),Humidity =ts(c(bikeLongTrain$Humidity,fore.mlp.long.Humidity$mean)),Wind_Speed =ts(c(bikeLongTrain$Wind_Speed,fore.mlp.long.Wind_Speed$mean)))BDF_xreg_short =data.frame(Day =ts(seq(1,724,1)),Season =ts(c(bikeShortTrain$Season)),Day_of_the_Week =ts(c(bikeShortTrain$Day_of_the_Week)),Working_Day =ts(c(bikeShortTrain$Working_Day)),# Weather_Type = ts(c(bikeShortTrain$Weather_Type)),Temperature =ts(c(bikeShortTrain$Temperature)),Humidity =ts(c(bikeShortTrain$Humidity)),Wind_Speed =ts(c(bikeShortTrain$Wind_Speed)))BDF_xreg_long =data.frame(Day =ts(seq(1,671,1)),Season =ts(c(bikeLongTrain$Season)),Day_of_the_Week =ts(c(bikeLongTrain$Day_of_the_Week)),Working_Day =ts(c(bikeLongTrain$Working_Day)),# Weather_Type = ts(c(bikeLongTrain$Weather_Type)),Temperature =ts(c(bikeLongTrain$Temperature)),Humidity =ts(c(bikeLongTrain$Humidity)),Wind_Speed =ts(c(bikeLongTrain$Wind_Speed)))# Fit MLP model for Total_Usersset.seed(seed)fit.mlp.st =mlp(ts(bikeShortTrain$Total_Users),reps =10, comb ="median",xreg = BDF_xreg_short, allow.det.season =TRUE, det.type ="bin")fit.mlp.lt =mlp(ts(bikeLongTrain$Total_Users),reps =10, comb ="median",xreg = BDF_xreg_long, allow.det.season =TRUE, det.type ="bin")# Forecast and evaluate ASE for short and longfore.mlp.st =forecast(fit.mlp.st, h =7, xreg = BDF_fore_short)fore.mlp.lt =forecast(fit.mlp.lt, h =60, xreg = BDF_fore_long)ASE.mlp.st =mean((bikeShortTest$Total_Users - fore.mlp.st$mean)^2)ASE.mlp.lt =mean((bikeLongTest$Total_Users - fore.mlp.lt$mean)^2)print(paste("MLP ASE Short Term: ", round(ASE.mlp.st, 2)))
[1] "MLP ASE Short Term: 4177344.82"
print(paste("MLP ASE Long Term: ", round(ASE.mlp.lt, 2)))
[1] "MLP ASE Long Term: 10000456.14"
plot(fore.mlp.st)
plot(fore.mlp.lt)
# Plot the forecastst =1:731plot(t[720:731],bike_DF$Total_Users[720:731], type ='l', xlab ="Time", ylab ="Total Users", main ="MLP Short Term Forecast")lines(t[725:731], fore.mlp.st$mean, type="l", lwd=2, lty =2, col ='blue')
plot(t[600:731],bike_DF$Total_Users[600:731], type ='l', xlab ="Time", ylab ="Total Users", main ="MLP Long Term Forecast")lines(t[672:731], fore.mlp.lt$mean, type="l", lwd=2, lty =2, col ='blue')
Ensemble Model
# Code for Figure 11.37# TODO: Adopt this for ours#Ensemble#VAR p = 7 non seasonalCMortVAR7 =VAR(cardiacTrain, type ="both", p =7) #p = 2 from SBCpreds7=predict(CMortVAR7,n.ahead=156)RMSEVAR7 =sqrt(mean((cardiacTest[,"cmort"] - preds7$fcst$cmort[,1])^2))RMSEVAR7 # 6.664#VAR p = 2 seasonalCMortVAR2S =VAR(cardiacTrain, season =52, type ="both", p =2) #p = 2 from SBCpreds2S=predict(CMortVAR2S,n.ahead=156)ensemble = (preds2S$fcst$cmort[,1] + fore.mlp.cmort$mean)/2#Plotplot(seq(1,508,1), cardiac[,"cmort"], type ="l",xlim =c(350,508), ylim =c(70,110), xlab ="Time", ylab ="Cardiac Mortality", main ="52 Week Cardiac Mortality Forecast From A VAR/MLP Ensemble")lines(seq(353,508,1), ensemble, type ="l", lwd =4, col ="green")lines(seq(353,508,1),preds2S$fcst$cmort[,1] , type ="l", lwd =2, lty =2, col ="red")lines(seq(353,508,1),fore.mlp.cmort$mean , type ="l", lwd =2, lty =4, col ="blue")lines(seq(353,508,1),preds7$fcst$cmort[,1] , type ="l", lwd =2, lty =2, col ="purple")RMSEVAR7 =sqrt(mean((cardiacTest[,"cmort"] - preds7$fcst$cmort[,1])^2))RMSEVAR7RMSEVAR2S =sqrt(mean((cardiacTest[,"cmort"] - preds2S$fcst$cmort[,1])^2))RMSEVAR2SRMSE =sqrt(mean((cardiacTest[,"cmort"] - fore.mlp.cmort$mean)^2))RMSERMSEENSEMBLE =sqrt(mean((cardiacTest[,"cmort"] - ensemble)^2))RMSEENSEMBLE # 5.64, 5,81
Model Comparison and Final Forecasts
Provide a table comparing all models on at least ASE and rwRMSE (if available).
Include at least one ensemble model in addition to the models above.
Make a case as to why you feel one of your candidate models is the most useful.
Provide you final short and long term forecasts with that model.